This is an exploratory analysis of a data set that reports child maltreatment incidents in Little Rock, Arkansas.
First, we plot the incidents on a map as a point process, using the latitudes and longitudes obtained by geocoding the addresses with ggmap.
# Setup: load all packages used in this analysis with pacman::p_load(),
# which installs any that are missing and then attaches them.
# NOTE(review): rm(list = ls()) was removed — it clears the workspace but
# not packages/options and is an anti-pattern in scripts; restart R for a
# clean session instead. devtools was only needed for the one-off GitHub
# installs below, so it is no longer attached at run time.
# devtools::install_github("dkahle/ggmap", ref = "tidyup")
# devtools::install_github("thomasp85/patchwork")
library(pacman)
p_load("sf"            # spatial data objects and methods
       , "mapview"      # interactive map viewing
       , "ggmap"        # ggplot2 addon for base maps
       , "cowplot"
       , "spatstat"     # KDE and other spatial functions
       , "raster"       # cell-based spatial operations
       , "tidyverse"    # data manipulation framework
       , "Hmisc"        # cut2() for ggplot legends (was listed twice)
       , "fitdistrplus" # distribution fitting functions
       , "lubridate"    # power tools for handling dates
       , "tidycensus"
       , "lwgeom"
       , "hrbrthemes"
       , "gridExtra"
       , "spdep"        # KNN functions
       , "foreach"
       , "doParallel"
       , "corrplot"
       , "ranger"       # random forest implementation
       , "glmnet"       # for ridge and lasso regression
       , "knitr"        # for kable tables
       , "kableExtra"
       , "FNN"          # KNN for CPS vs. NN plots
       , "groupdata2"
       , "htmltools"
       , "viridis"
       , "viridisLite")
# rm(list = ls())
# Load the accepted child-maltreatment referrals and build one full street
# address string per record for geocoding.
# NOTE(review): setwd() in scripts is fragile; prefer project-relative
# paths. Kept so the script runs unchanged on the original machine.
setwd("~/Data/LR-Child")
crime <- read.csv(file = "child-mt-accepted.csv", header = TRUE, sep = ",",
                  na.strings = "NULL")
# Title-case the street and city names so the addresses are readable.
crime$STR_NME <- crime$STR_NME %>% as.character %>% str_to_title
crime$CTY_NME <- crime$CTY_NME %>% as.character %>% str_to_title
# BUG FIX: the original used attach(crime) and bare column names; attach()
# captured copies of the columns BEFORE the str_to_title() mutations above,
# so the pasted addresses kept the original casing (see the all-caps street
# names in the sample output). Referencing crime$... directly uses the
# cleaned values, and avoids the attach() anti-pattern entirely.
full_add <- paste(crime$STR_NBR, crime$STR_NME, crime$STR_SFX_TYP_DESC, ",",
                  crime$CTY_NME, ",", "AR")
# Also store the address on the data frame so downstream code can refer to
# crime$full_add.
crime$full_add <- full_add
head(full_add)
[1] "4511 LUDWIG , LITTLE ROCK , AR"
[2] "800 BEACH Street , NORTH LITTLE ROCK , AR"
[3] "8208 WESTWOOD Avenue , LITTLE ROCK , AR"
[4] "6838 CANTRELL Road , LITTLE ROCK , AR"
[5] "400 COLLEGE PARK , NORTH LITTLE ROCK , AR"
[6] "17809 LAWSON Road , LITTLE ROCK , AR"
library(ggmap)
# A Google Maps API key is required for geocoding; never commit a real key
# to version control (read it from an environment variable instead).
register_google(key = "<your key here>")
# BUG FIX: the original called geocode(crime$full_add), but full_add was
# created as a free-standing vector and never assigned into `crime`, so
# crime$full_add was NULL. Pass the vector directly.
geocoded_latlon <- geocode(full_add, output = "latlon")
geocoded_latlon %>% head()
# Bind the address and coordinates onto the data and cache to disk so the
# slow, rate-limited (and billable) geocoding step only has to run once.
crime.gc <- cbind(crime, full_add, geocoded_latlon)
write.csv(crime.gc, "LR-child-mt-accepted-geocoded.csv")
# Re-load the cached geocoded data so the analysis below can run without
# repeating the geocoding step.
setwd("~/Data/LR-Child")  # NOTE(review): redundant within a single session
crime.gc.read <- read.csv(file = "LR-child-mt-accepted-geocoded.csv",
                          header = TRUE, sep = ",")
# The city field is free text, so count how many distinct spellings occur.
tab <- as.data.frame(table(crime.gc.read$CTY_NME))
cat("Number of different city names", nrow(tab))
Number of different city names 207
# Keep only city names with more than 50 incidents and plot their counts,
# most frequent first; coord_flip() puts the long names on the y-axis.
tab <- tab %>% filter(Freq > 50)
ggplot(tab, aes(x = reorder(Var1, -Freq), y = Freq)) +
  geom_col() +  # idiomatic form of geom_bar(stat = "identity")
  xlab("City Names") + ylab("Count") + coord_flip() +
  theme(axis.text = element_text(size = 10))
It appears there are many different city names in this data – what do they represent?
For this report, we filter the city names by “Little Rock” and use Little Rock boundaries to filter out addresses that are outside LR.
# Keep Little Rock records whose coordinates fall inside an LR bounding
# box; this drops obviously bad geocodes that landed outside the city.
# NOTE(review): the box limits appear hand-derived — confirm their source.
lr_lat <- c(34.62606, 34.82195)
lr_lon <- c(-92.52091, -92.15494)
crime.gc.lr <- crime.gc.read %>%
  filter(CTY_NME == "Little Rock") %>%
  filter(lat >= lr_lat[1], lat <= lr_lat[2]) %>%
  filter(lon >= lr_lon[1], lon <= lr_lon[2])
nrow(crime.gc.lr)
[1] 5136
# Quick dot map of the incident locations.
qmplot(lon, lat, data = crime.gc.lr)
# Same points over a darkened toner-lite basemap, with a 2D kernel-density
# ("propensity") overlay on top.
# NOTE(review): the ..level.. notation is deprecated in current ggplot2;
# after_stat(level) is the modern equivalent — update when upgrading.
qmplot(lon, lat, data = crime.gc.lr, geom = "point", maptype = "toner-lite", darken = .5, color = I("red"), legend = "topleft") +
stat_density_2d(aes(fill = ..level..), geom = "polygon", alpha = .5, color = NA) +
scale_fill_gradient2("Propensity", low = "white", mid = "yellow", high = "red", midpoint = 60)
library(lubridate)
# str(crime.gc)
# Some rows have a blank Incident.Date; fall back to the Referral.Date.
# (ifelse() keeps this vectorized across the whole column.)
crime.gc.lr$Incident.Date <- ifelse(as.character(crime.gc.lr$Incident.Date) == "",
                                    as.character(crime.gc.lr$Referral.Date),
                                    as.character(crime.gc.lr$Incident.Date))
crime.gc.lr$Incident.Date <- as.Date(crime.gc.lr$Incident.Date,
                                     format = "%m/%d/%Y")
# Incident.Date is already a Date at this point, so year()/week() can be
# applied directly; the original re-parsed it with ymd(), which is
# redundant and can emit spurious parse warnings on Date input.
crime.gc.lr$IncidentYear <- lubridate::year(crime.gc.lr$Incident.Date)
crime.gc.lr$IncidentWeek <- lubridate::week(crime.gc.lr$Incident.Date)
library(dplyr)
# Incident counts per year and per (year, week), restricted to 2015+
# (earlier years are incomplete).
crime.yearly.ct <- crime.gc.lr %>%
  filter(IncidentYear >= 2015) %>%
  group_by(IncidentYear) %>%
  dplyr::summarise(count = n())
crime.weekly.ct <- crime.gc.lr %>%
  filter(IncidentYear >= 2015) %>%
  group_by(IncidentYear, IncidentWeek) %>%
  dplyr::summarise(count = n())
The following figure shows the weekly incident counts by year; note that the 2018 data stop at roughly the same point in the year (June) at which the 2015 data begin.
library(ggplot2)
# Weekly incident counts, one facet per year (free y-scale so low-count
# years stay readable). geom_line()'s default stat is already "identity",
# and geom_col() is the idiomatic form of geom_bar(stat = "identity").
(crime.plot <- ggplot(crime.weekly.ct,
                      aes(x = IncidentWeek, y = count,
                          group = as.factor(IncidentYear),
                          colour = as.factor(IncidentYear))) +
   geom_line() +
   geom_col(alpha = 0.2) +
   ylab("Incident Count") +
   xlab("Weeks") +
   facet_grid(IncidentYear ~ ., scales = "free_y") +
   theme(legend.position = "right"))
# Total incidents per year.
(crime.plot <- ggplot(crime.yearly.ct, aes(x = IncidentYear, y = count)) +
   geom_col() +
   ylab("Incident Count") +
   xlab("Years") +
   labs(title = "Yearly Counts") +
   theme(panel.grid.major = element_blank(),
         panel.grid.minor = element_blank(),
         axis.line = element_line(colour = "black")) +
   theme(legend.position = "right"))
This animation shows how the incidents moved over time - the animation shows the points on the map of LR over years from 2015 to 2018.
library(gganimate)
# Aggregate to one row per (location, year) with an incident count.
crime.ct.yr <- crime.gc.lr %>%
  filter(IncidentYear >= 2015) %>%
  group_by(lon, lat, IncidentYear) %>%
  dplyr::summarise(count = n())
# Base map first, then purple bubbles sized by count; transition_time()
# produces one animation frame per year (2015-2018 => 4 frames).
p <- qmplot(lon, lat, data = crime.ct.yr, maptype = "toner-lite",
            color = I("red"))
p <- p +
  geom_point(aes(x = lon, y = lat, size = count),
             data = crime.ct.yr, colour = 'purple', alpha = .7) +
  labs(title = 'Year: {frame_time}', x = 'lon', y = 'lat') +
  transition_time(IncidentYear)
animate(p, fps = 1, nframes = 4)
Frequent Pattern Mining: Frequent Itemset Mining: - Applications: Market Basket Analysis / Click stream Analysis / Web Link Analysis.
Frequent Item Set Mining is a method for market basket analysis.
It aims at finding regularities in the shopping behavior of customers of supermarkets, mail-order companies, on-line shops etc.
More specifically: Find sets of products that are frequently bought together.
Possible applications of found frequent item sets:
Often found patterns are expressed as association rules, for example: If a customer buys bread and wine, then she/he will probably also buy cheese.
Building blocks problem shares some similarity with market basket analysis:
Mutual exclusivity: it is redundant for a single customer to buy multiple interchangeable items of the same type, yet the general population as a whole still buys from that category — e.g. one customer is unlikely to buy both Coke and Pepsi, but most soda drinkers buy some brown soda; thus {Coke, Pepsi} is the building block for the soda requirement.
Formally, our goal here is to identify “building blocks” for child maltreatment from the binary incidence variables, observed as an \(m \times n\) sparse binary matrix, where \(m\) and \(n\) denote the number of observations and the number of different allegations/events respectively. The idea is to treat these data as transaction data, where each record represents a transaction and each allegation/event an item. In the language of `itemset mining', we have an item set \(I\) and a transaction set \(D\), and each transaction in \(D\) contains a subset of the items in \(I\). In our case, each subject may have 1's on a subset of the list of columns.
First, we define some preliminaries: a rule \(X \Rightarrow Y\) implies \(X, Y \subseteq I\) and \(X \cap Y = \emptyset\), where \(X\) is called the LHS or ‘antecedent’ and \(Y\) is called the RHS or ‘consequent’. The support \(supp(X)\) of an itemset \(X\) is defined as the proportion of transactions in the data set which contain the itemset. The confidence of a rule is defined as \(conf(X \Rightarrow Y ) = supp(X \cup Y )/supp(X)\), and mimics the conditional probability \(P(Y \mid X)\). A third interest measure, lift, is defined as \(lift(X \Rightarrow Y) = supp(X \cup Y )/(supp(X)\times supp(Y))\), which gives the deviation of the support of the rule \(X \Rightarrow Y\) from the support expected under independence — i.e. higher lift means stronger association.
An association rule is a rule that surpasses a user-specified minimum support and minimum confidence threshold; to select the interesting association rules, we further filter (or rank) them by an additional interest measure, e.g. lift. There are other measures of interest, such as the chi-square measure, conviction and leverage, but we skip them for the sake of simplicity.
Now, we use two R-packages arules and arulesViz to find a few “interesting” association rules (building blocks) and plot them. We first load the packages and create the data-sets from the raw data.
To see which items are important in the data set we can use the itemFrequencyPlot. To reduce the number of items, we only plot the item frequency for items with a support greater than 5% (using the parameter support). For better readability of the labels, we reduce the label size with the parameter cex.names.
library(arules)
library(arulesViz)
# Coerce the binary allegation/event indicator columns into an arules
# "transactions" object (each row = one transaction, each column = item).
# NOTE(review): columns 10:130 are assumed to be the indicator columns —
# confirm against the CSV header; selecting by name would be more robust
# than positional indexing, which breaks if the file layout changes.
crime.lr <- as(as.matrix(crime.gc.lr[,10:130]),"transactions")
# Item (allegation) frequencies, shown only for items with support > 5%.
itemFrequencyPlot(crime.lr, support = 0.05)
Next, we call the function apriori() to find all rules (the default association type for apriori()) with a minimum support of 6% and a minimum confidence of 0.9. The summary() function gives an overview of the association rules found using this function.
# Mine association rules with minimum support 6% (>= ~308 of the 5136
# transactions) and minimum confidence 0.9.
# NOTE(review): minlen = 1 permits rules with an empty antecedent
# ({} => Y); minlen = 2 is usually what is intended — confirm.
rules <- apriori(crime.lr, parameter = list(support = 0.06, confidence = 0.9,
minlen = 1))
Apriori
Parameter specification:
confidence minval smax arem aval originalSupport maxtime support minlen
0.9 0.1 1 none FALSE TRUE 5 0.06 1
maxlen target ext
10 rules FALSE
Algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
Absolute minimum support count: 308
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[112 item(s), 5136 transaction(s)] done [0.00s].
sorting and recoding items ... [15 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 done [0.00s].
writing ... [11 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
# Overview of the mined rules: count, length distribution, and the
# distribution of support/confidence/lift.
summary(rules)
set of 11 rules
rule length distribution (lhs + rhs):sizes
2
11
Min. 1st Qu. Median Mean 3rd Qu. Max.
2 2 2 2 2 2
summary of quality measures:
support confidence lift count
Min. :0.06094 Min. :1 Min. :1.733 Min. : 313.0
1st Qu.:0.06532 1st Qu.:1 1st Qu.:1.733 1st Qu.: 335.5
Median :0.09404 Median :1 Median :1.733 Median : 483.0
Mean :0.10141 Mean :1 Mean :2.454 Mean : 520.8
3rd Qu.:0.10514 3rd Qu.:1 3rd Qu.:2.545 3rd Qu.: 540.0
Max. :0.22118 Max. :1 Max. :7.234 Max. :1136.0
mining info:
data ntransactions support confidence
crime.lr 5136 0.06 0.9
# Render the 20 highest-lift rules as a table.
# NOTE(review): this relies on inspect() returning a data.frame (recent
# arules versions); in older versions inspect() returned invisibly and
# kable() would receive NULL — confirm the installed version.
kable(inspect(head(sort(rules, by ="lift"),20)),digits=4)
lhs rhs support confidence lift count
[1] {Sexual.Contact} => {Broad.Alleg...Sexual.Abuse} 0.10085670 1 7.233803 518
[2] {Striking.Child.with.Closed.Fist} => {Broad.Alleg...Abuse} 0.06094237 1 2.545094 313
[3] {Striking.child..7.18yrs..on.face.head} => {Broad.Alleg...Abuse} 0.10942368 1 2.545094 562
[4] {Cuts..Bruises..Welts} => {Broad.Alleg...Abuse} 0.15342679 1 2.545094 788
[5] {Medical.Neglect} => {Broad.Alleg...Neglect} 0.06736760 1 1.732794 346
[6] {Educational.Neglect} => {Broad.Alleg...Neglect} 0.08450156 1 1.732794 434
[7] {Inadequate.Food} => {Broad.Alleg...Neglect} 0.06327882 1 1.732794 325
[8] {Failure.to.Protect} => {Broad.Alleg...Neglect} 0.06172118 1 1.732794 317
[9] {Environmental.Neglect} => {Broad.Alleg...Neglect} 0.09871495 1 1.732794 507
[10] {Broad.Alleg...Neglect..True.} => {Broad.Alleg...Neglect} 0.09404206 1 1.732794 483
[11] {Inadequate.Supervision} => {Broad.Alleg...Neglect} 0.22118380 1 1.732794 1136
One way of identifying the interesting association rules is to look at rules involving known “interesting” items (allegations), and study their association strengths or measures of interest, such as lift, confidence, support, etc.
# Additional interest measures for each rule, evaluated against the data.
# FIX: the measure name registered in arules is "chiSquared" (the output
# column header confirms this); the original "chiSquare" only resolved via
# partial matching, which is fragile across versions.
interestMeasure(rules,
                c("support", "chiSquared", "confidence", "coverage", "lift"),
                crime.lr)
support chiSquared confidence coverage lift
1 0.06736760 271.8612 1 0.06736760 1.732794
2 0.08450156 347.3871 1 0.08450156 1.732794
3 0.06327882 254.2463 1 0.06327882 1.732794
4 0.06172118 247.5762 1 0.06172118 1.732794
5 0.06094237 514.9998 1 0.06094237 2.545094
6 0.09871495 412.2184 1 0.09871495 1.732794
7 0.10085670 3591.3184 1 0.10085670 7.233803
8 0.10942368 975.0348 1 0.10942368 2.545094
9 0.09404206 390.6796 1 0.09404206 1.732794
10 0.15342679 1438.1913 1 0.15342679 2.545094
11 0.22118380 1068.8702 1 0.22118380 1.732794
Here we show a few plots to understand these rules better.
Recall that for a rule \(X \Rightarrow Y\), X is called antecedent and Y is called consequent
This shows the rules (or itemsets) as a graph with items as labeled vertices, and rules (or itemsets) represented as vertices connected to items using arrows. For rules, the LHS items are connected with arrows pointing to the vertex representing the rule and the RHS has an arrow pointing to the item.
# Interactive graph view: items are labelled vertices; each rule is a
# vertex with arrows in from its LHS items and an arrow out to its RHS.
plot(rules, method="graph", engine = "htmlwidget")
We can also draw a basic scatterplot of the rules mined earlier by the apriori() function, plotting their support (x-axis) against their lift (y-axis), with the shading of the points (rules) increasing with their confidence. Interesting rules should have high support, high confidence and high lift.
# Scatterplot of the rules: support (x) vs. lift (y), shaded by confidence.
plot(rules, measure=c("support", "lift"), shading="confidence")
The Matrix-based visualization techniques organize the antecedent and consequent itemsets on the x and y-axes, respectively. A selected interest measure is displayed at the intersection of the antecedent and consequent of a given rule. If no rule is available for a antecedent/consequent combination the intersection area is left blank.
Next, we plot the rules using the matrix visualization. The color shading of the squares represents the interest measure, lift. The plot doesn’t display the long labels on the axes, but it prints the antecedents and consequents to the terminal. The last argument creates a tidier figure by putting rules with similar lift together.
# Matrix view: antecedents on one axis, consequents on the other, cells
# shaded by lift; the reorder control groups rules with similar measures.
# NOTE(review): the accepted `reorder` values changed in newer arulesViz —
# confirm "support/confidence" is still valid for the installed version.
plot(rules, method="matrix", measure="lift", control=list(reorder="support/confidence"))
Itemsets in Antecedent (LHS)
[1] "{Inadequate.Supervision}"
[2] "{Cuts..Bruises..Welts}"
[3] "{Striking.child..7.18yrs..on.face.head}"
[4] "{Sexual.Contact}"
[5] "{Environmental.Neglect}"
[6] "{Broad.Alleg...Neglect..True.}"
[7] "{Educational.Neglect}"
[8] "{Medical.Neglect}"
[9] "{Inadequate.Food}"
[10] "{Failure.to.Protect}"
[11] "{Striking.Child.with.Closed.Fist}"
Itemsets in Consequent (RHS)
[1] "{Broad.Alleg...Neglect}" "{Broad.Alleg...Abuse}"
[3] "{Broad.Alleg...Sexual.Abuse}"